***********************************************************************************
* Merging NEW-OWES-ONET-SOC-CEN2010-IPUMS with CPS data
* last modified 7/17/2025
*
* Note: Need to aggregate SOC in NEW-OWES-ONET-SOC-CEN2010-IPUMS into CEN2010 in CPS
*
***********************************************************************************


* Start from a clean session: close any open log (the error raised when no log
* is open is suppressed by -capture-), empty memory, and turn off output paging.
capture log close
clear
set more off

***************
* Directories *
***************

* Working directory; every global below is a path relative to it.
cd "/Users/"

* Locations of code, input data, crosswalks, results and scratch files.
global dofile       "./Code/NEW-OEWS"
global matrix_data  "./Data/National Employment Matrix"
global oews_data    "./Data/OEWS/all_excel"
global onet_data    "./OccLink/ONET/ONET data/merged"
global cps_data     "./Data/CPS/IPUMS/00015"
global xwalk        "./Data/Crosswalk"
global results      "./Results"
global temp         "./Temp"



********************************
* Clean CPS data               *
********************************

* Load the CPS extract. -clear- is the documented option of -use- for
* discarding the data currently in memory.
use "${cps_data}/cps_00015.dta", clear

** generate a unique person id **
* String id built as year + serial + pernum, with pernum zero-padded to two
* digits so the trailing component always has a fixed width.
tostring pernum, gen(per) format(%02.0f)
egen personid = concat(year serial per)

** generate an immigrant variable **
* Values of citizen other than 1-5 are left unchanged by -recode-.
recode citizen (1 2 3 = 1 "US born") (4 = 2 "Naturalized") (5 = 3 "non-citizen"), gen(immigrant)

** generate a marital status variable **
recode marst (1 2 = 1 "married") (3 4 5 = 2 "separated/divorced/widowed") (6 = 3 "single/never married"), gen(marital)

** generate a race variable **
recode hispan (0 901 902 = 0 "Not") (100/612 = 1 "Hispanic"), gen(hispanic)
* The raw race variable is kept as raceX; the name race is reused for the
* three-category recode.
rename race raceX
recode raceX (100 = 1 "White") (200 = 2 "Black") (else = 3 "Asians and Others"), gen(race)

** generate employment status variable
recode empstat (0=0 "NIU, ages 0-14") (1/12=1 "employed") (21/22=2 "unemployed") (30/36=3 "Not in labor force"), gen(empstatus)

** generate education group variable
recode educ (2/72 = 1 "Below HS") (73 = 2 "HS") (80/110 = 3 "Some college") (111/125 = 4 "BA or above") (else = .), gen(edu_gr)

** generate age group variable
* The middle group covers ages 36 through 55, and its label now says so.
* Ages below 15 are set to missing instead of passing through unchanged
* (raw ages 1, 2 and 3 would otherwise be indistinguishable from the group
* codes), and the top group is open-ended to match its "56+" label.
recode age (15/35 = 1 "15-35") (36/55 = 2 "36-55") (56/max = 3 "56+") (else = .), gen(age_gr)

** generate sex variables
recode sex (1=1 "male") (2=0 "female"), gen(male)

** generate a weight variable **
* NOTE(review): asecwth looks like the IPUMS ASEC household-level weight;
* person-level analyses usually use asecwt. Confirm this is intended.
clonevar weight = asecwth



* Occupation codes: occ_t1 is merged later with the destination occupation
* file and occ_t0 with the origin occupation file.
rename occ2010 occ_t1
rename occ10ly occ_t0

save "${temp}/cps_2000-2020_temp.dta", replace



********************************
* Merge OCC data with CPS data *
********************************
/* Prepare origin (t0) and destination (t1) occ data.
   Both files are copies of the same occupation-by-year file. The only
   differences are the name of the occupation key (occ_t0 or occ_t1) and the
   suffix added to every occupation characteristic, so both are built in one
   loop instead of two copy-pasted stanzas. */

* Occupation characteristics that receive the _t0 / _t1 suffix
local occ_chars empsize projsize proj_pc_growth annual_mean annual_pct10 annual_pct90 ///
	emp_chng_rate emp_pc_growth outlook outlook_gr projjobopen

* Suffix -> output file stem
local role_t0 origin
local role_t1 destination

foreach time in t0 t1 {

	* -clear- is the documented option of -use- for discarding data in memory
	use "${temp}/oews_matrix_ooh2000-2020_ipums_occ.dta", clear

	* Keys used by the later m:1 merges onto the CPS file
	rename year matchyear
	rename ipums2010 occ_`time'

	foreach var of varlist `occ_chars' {
		rename `var' `var'_`time'
	}

	* Writes origin_occ.dta for t0 and destination_occ.dta for t1
	save "${temp}/`role_`time''_occ.dta", replace
}


/* merge occ with CPS */
* -clear- is the documented option of -use- for discarding data in memory.
use "${temp}/cps_2000-2020_temp.dta", clear

* generate match year
* Only the listed years receive a match year. All other survey years keep a
* missing matchyear, because the rule mapping odd years to the previous year
* is commented out below.
gen 	 matchyear = year   if inlist(year, 2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2019, 2020)
* replace  matchyear = year-1 if inlist(year, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017)


* Attach origin-occupation (_t0) and destination-occupation (_t1)
* characteristics. keep(1 3) retains every CPS record, matched or not, and
* discards occupation-year rows that match no CPS record.
merge m:1 matchyear occ_t0 using "${temp}/origin_occ.dta", keep(1 3) nogenerate
merge m:1 matchyear occ_t1 using "${temp}/destination_occ.dta", keep(1 3) nogenerate


/* adjust inflation, in 2020 dollars january to january */

* Survey year followed by the multiplier applied to wages observed in that
* year. The table is stored once here instead of being repeated inside the
* loop over wage variables.
local deflators ///
	2000 1.53  2001 1.47  2002 1.46  2003 1.42  2004 1.39  2005 1.35  2006 1.30 ///
	2007 1.27  2008 1.22  2009 1.22  2010 1.19  2011 1.17  2012 1.14  2013 1.12 ///
	2014 1.10  2015 1.10  2016 1.09  2017 1.06  2018 1.04  2019 1.02  2020 1.00

* Per-observation multiplier. It is held in double precision so the product
* below equals the one obtained with the literal constant. Years absent from
* the table keep a missing multiplier and therefore a missing adjusted wage.
tempvar deflator
gen double `deflator' = .

local npairs : word count `deflators'
forvalues i = 1(2)`npairs' {
	local j = `i' + 1
	local yr     : word `i' of `deflators'
	local factor : word `j' of `deflators'
	replace `deflator' = `factor' if year == `yr'
}

* The wage stems and the t0/t1 suffixes are plain tokens, so they are looped
* over with -in-. -of newlist- would validate each token as a new variable
* name and fail if a variable with that name happened to exist.
foreach var in annual_mean annual_pct10 annual_pct90 {
	foreach time in t0 t1 {
		gen adj_`var'_`time' = `var'_`time' * `deflator'
	}
}

drop `deflator'



* Variables carried forward: person-level identifiers, demographics and the
* weight, plus the occupation code and occupation characteristics for both
* the origin (_t0) and destination (_t1) occupation.
local person_vars year personid age sex race hispanic immigrant marital ///
	empstatus edu_gr age_gr male weight

local occ_stems empsize projsize proj_pc_growth adj_annual_mean adj_annual_pct10 ///
	adj_annual_pct90 emp_chng_rate emp_pc_growth outlook outlook_gr projjobopen

* Expand the stems into the full list of _t0 and _t1 variable names
local occ_vars
foreach time in t0 t1 {
	local occ_vars `occ_vars' occ_`time'
	foreach stem of local occ_stems {
		local occ_vars `occ_vars' `stem'_`time'
	}
}

keep `person_vars' `occ_vars'


********************************
* Mark Sample                  *
********************************

/* restrict to ages 16-65 */
drop if !inrange(age, 16, 65)

/* restrict to the years with available OOH data */
drop if !inlist(year, 2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)


save "${temp}/cps_2000-2020_wide.dta", replace


***********************************************************
* Reshape the data for discrete choice models for movers  *
***********************************************************

* -clear- is the documented option of -use- for discarding data in memory.
use "${temp}/cps_2000-2020_wide.dta", clear

/* Keep movers */
* A mover is a record whose origin and destination occupation codes differ.
keep if occ_t0 != occ_t1

/* Keep civilian, employed population */
* Drops records where either occupation code is 9800 or above. Stata treats
* a missing value as larger than any number, so records with a missing
* occupation code are dropped here as well.
* NOTE(review): codes at or above 9800 presumably cover military and
* not-in-universe categories; confirm against the occupation codebook.
drop if occ_t0 >= 9800 | occ_t1 >= 9800

save "${temp}/cps_2000-2020_wide_movers.dta", replace

/* Merge choice set data with individual data */
tempfile occ_set

* Step 1: list of candidate occupations from the occupation-by-year file.
* -clear- makes the load independent of whether the data in memory were
* modified since the last save. Year is not used when the choice set is
* built, so only the distinct occupation codes are kept.
use "${temp}/oews_matrix_ooh2000-2020_ipums_occ.dta", clear

rename ipums2010 choice_occ

keep choice_occ
duplicates drop

save "`occ_set'"

* Step 2: every mover paired with every candidate occupation.
use "${temp}/cps_2000-2020_wide_movers.dta", clear

* Seed each person with the occupation actually chosen, so that it is part
* of the set of values -fillin- expands over.
gen choice_occ = occ_t1

* The appended occupation rows carry an empty personid.
append using "`occ_set'"

keep personid choice_occ

* Creates all personid x choice_occ combinations and adds the _fillin
* indicator, which stays in the saved file.
fillin personid choice_occ

* Remove the placeholder rows contributed by the occupation list.
drop if personid == ""

* Step 3: bring the person-level variables back onto each person-by-occupation
* row. m:1 is the documented spelling of the many-to-one merge type, and all
* merge results are kept, which is the default.
merge m:1 personid using "${temp}/cps_2000-2020_wide_movers.dta"

drop _merge

save "${temp}/cps_2000-2020_choice_set_movers.dta", replace

/* Merge choice set occupation characteristics with individual data */

* -clear- is the documented option of -use- for discarding data in memory.
use "${temp}/oews_matrix_ooh2000-2020_ipums_occ.dta", clear

rename ipums2010 choice_occ

* Suffix the occupation characteristics so they do not collide with the _t0
* and _t1 versions already present in the person-level data.
foreach var of varlist empsize projsize proj_pc_growth annual_mean annual_pct10 annual_pct90 emp_chng_rate emp_pc_growth outlook outlook_gr projjobopen {
	rename `var' `var'_choice
}

* One occupation-year row matches many person-by-occupation rows. 1:m is the
* documented spelling of the one-to-many merge type. All merge results are
* kept (the default) and _merge stays in the saved file.
merge 1:m year choice_occ using "${temp}/cps_2000-2020_choice_set_movers.dta"

* Occupation-year rows that matched no person have an empty personid; drop
* them. Person rows without occupation data for their year are retained,
* with missing _choice characteristics.
drop if personid == ""

save "${temp}/cps_2000-2020_choice_set_movers.dta", replace
